This IPython notebook is the project by Hongyi Tang and Weijian Li for course 12752. The project consists of four notebooks in total, each covering one cluster-analysis task. In this notebook, cluster analysis is applied to four building types.



In [2]:

    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

%matplotlib inline

Please download the dataset and change the file path.



In [3]:

    
# Read in the CBECS data; the first CSV column (PUBID) becomes the row index.
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `pd.read_csv(..., index_col=0)` is the supported replacement.
# NOTE: change this path to wherever CBECS.csv was downloaded.
data = pd.read_csv('C:/F16-12-752-master/projects/thongyi_weijian1/data/CBECS.csv', index_col=0)
# Show the last rows as a quick sanity check of the load.
data.tail()









    Out[3]:






  
    
      
      REGION
      CENDIV
      PBA
      FREESTN
      SQFT
      SQFTC
      WLCNS
      RFCNS
      RFCOOL
      RFTILT
      ...
      FKCLBTU
      FKWTBTU
      FKCKBTU
      FKOTBTU
      DHHTBTU
      DHCLBTU
      DHWTBTU
      DHCKBTU
      DHOTBTU
      PUBCLIM
    
    
      PUBID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      6716
      3
      5
      14
      1.0
      108000
      7
      1
      6
      2
      1
      ...
      0.0
      0.0
      0.0
      0.0
      NaN
      NaN
      NaN
      NaN
      NaN
      2
    
    
      6717
      3
      7
      5
      1.0
      1700
      2
      5
      5
      2
      2
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      2
    
    
      6718
      2
      3
      26
      1.0
      2000
      2
      1
      4
      2
      2
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      1
    
    
      6719
      1
      2
      12
      1.0
      19250
      4
      1
      4
      2
      1
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      2
    
    
      6720
      3
      5
      14
      1.0
      142000
      7
      1
      1
      1
      2
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      2
    
  

5 rows × 1118 columns

This time, four building types are selected: office, public assembly, inpatient health care, and service.



In [624]:

    
# PBA (building activity) codes kept for this analysis.
# Presumably 2 = office, 13 = public assembly, 16 = inpatient health care,
# 26 = service -- verify against the CBECS codebook.
type_B = [2, 13, 16, 26]
# PBA codes that are filtered out.
type_C = [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 91]

# Drop rows where NGUSED == 2, then every row whose PBA code is excluded.
data_type = data[data.NGUSED != 2]
data_type = data_type[~data_type.PBA.isin(type_C)]

# Columns that get divided by the SQFT column.
index = ['ELBTU', 'NGBTU', 'ELVNBTU', 'NGHTBTU']
per_sqft = data_type[index].div(data_type.SQFT, axis=0)

# Result table: building type first, then one per-SQFT column per entry of `index`.
energydata = pd.DataFrame()
energydata['Building Type'] = data_type.PBA
for column_name in index:
    energydata[column_name] = per_sqft[column_name]



In [625]:

    
# Keep only complete rows, then discard any row that has a zero in any column.
energydata = energydata.dropna(how='any')
row_has_zero = (energydata == 0).any(axis=1)
energydata = energydata[~row_has_zero]

# Building types still present, and how many rows each has: [n_rows, type code].
PBA1 = energydata['Building Type'].unique()
count = [
    [len(energydata[energydata['Building Type'] == code]), code]
    for code in PBA1
]
count









    Out[625]:





[[714, 2], [244, 26], [226, 13], [283, 16]]



In [626]:

    
# Box plots of the per-SQFT energy columns, one panel per selected building type.
fig1 = plt.figure(figsize=(15,15))

# Split the table by building type; the leading 'Building Type' column is
# removed so that only the numeric feature columns remain.
data_seperate = []
for code in type_B:
    subset = energydata[energydata['Building Type'] == code]
    data_seperate.append(subset.drop(subset.columns[0], axis=1))

# Panels fill the first slots of a len(type_B) x 2 grid; `times` is the 1-based slot.
times = 1
for code, frame in zip(type_B, data_seperate):
    plt.subplot(len(type_B), 2, times)
    frame.boxplot()
    plt.title(code)
    plt.ylim(0, 300)  # common y-range so the panels can be compared directly
    times += 1









    



C:\Users\thong\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: FutureWarning: 
The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.



In [627]:

    
# Stack the per-building-type frames (in type_B order) into one table.
# pd.concat replaces the loop of DataFrame.append calls: append copied the
# whole frame on every iteration and was removed in pandas 2.0.
y = pd.concat(data_seperate)
# Feature matrix for clustering as float32. `.values` replaces
# DataFrame.as_matrix(), which was removed in pandas 1.0.
X = y.values.astype(np.float32)



In [628]:

    
from sklearn.cluster import KMeans

# Fit K-means with four clusters on the feature matrix and label every row.
num_clust = 4
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)

# One panel per cluster: every member row as a thin grey line, with the
# cluster centre drawn on top in black. The member count is printed per cluster.
fig2 = plt.figure(figsize=(15,15))
for cluster_id, centre in enumerate(clusters.cluster_centers_):
    plt.subplot(num_clust, 2, cluster_id + 1)
    cluster_members = X[cluster_assignments == cluster_id]
    print(len(cluster_members))
    for member in cluster_members:
        plt.plot(member, color='grey', lw='0.1')
    plt.plot(centre, color='k', lw='1')

One cluster contains only 12 samples. We treat these samples as outliers and delete them.



In [629]:

    
# Remove the outlier cluster found by the preceding K-means fit.
# Re-running this cell without re-running the fit would pair stale labels with
# an already-filtered table, so fail early with a clear message.
if len(cluster_assignments) != len(y):
    raise ValueError('cluster_assignments does not match y; re-run the K-means cell first')

# K-means numbers its clusters arbitrarily on every run, so the outlier cluster
# is identified by size (the one with the fewest members) rather than by a
# hard-coded label.
# NOTE(review): if the outliers end up split across two small clusters, more
# than one label has to be removed here -- check the printed cluster sizes.
cluster_labels, cluster_sizes = np.unique(cluster_assignments, return_counts=True)
outlier_cluster = cluster_labels[cluster_sizes.argmin()]

# Keep every sample that is not in the outlier cluster and rebuild the feature
# matrix. `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
y = y[cluster_assignments != outlier_cluster]
X = y.values.astype(np.float32)









    Out[629]:






  
    
      
      ELBTU
      NGBTU
      ELVNBTU
      NGHTBTU
    
    
      PUBID
      
      
      
      
    
  
  
    
      9
      59.424400
      19.931467
      17.621520
      16.893707
    
    
      11
      42.062146
      20.280000
      18.361463
      19.146244
    
    
      18
      10.216667
      8.183000
      2.971167
      8.183000
    
    
      20
      37.380282
      22.818521
      14.557141
      21.278775
    
    
      36
      30.185778
      8.474478
      13.816567
      7.493933
    
    
      51
      41.638982
      29.714506
      14.336681
      20.583518
    
    
      69
      111.319939
      3.025333
      33.627515
      0.912000
    
    
      100
      25.897111
      17.766667
      7.408889
      17.686444
    
    
      108
      20.873004
      29.933220
      6.394295
      27.467557
    
    
      115
      44.951358
      17.417264
      9.733698
      15.784358
    
    
      126
      23.904455
      51.566818
      7.272182
      50.182545
    
    
      127
      27.743737
      3.023232
      6.886364
      2.647980
    
    
      136
      35.718333
      46.894167
      8.124167
      45.752500
    
    
      144
      15.735960
      18.139394
      5.893737
      18.139394
    
    
      152
      10.443100
      7.113500
      2.025250
      5.181050
    
    
      165
      352.486637
      52.439644
      61.461287
      42.351813
    
    
      175
      34.545566
      33.377292
      7.076623
      33.377292
    
    
      222
      16.480294
      2.421816
      4.573518
      1.926827
    
    
      235
      4.984667
      5.070400
      2.002533
      5.070400
    
    
      237
      63.594504
      6.742274
      10.754800
      6.742274
    
    
      248
      42.843948
      64.437343
      9.268919
      64.437343
    
    
      264
      9.204675
      6.999475
      2.561925
      1.634537
    
    
      282
      92.075067
      24.982667
      29.381333
      22.995467
    
    
      293
      43.000548
      3.341918
      12.905205
      0.055342
    
    
      297
      98.516132
      85.183974
      28.013116
      50.586268
    
    
      299
      3.289518
      0.210000
      1.287470
      0.210000
    
    
      300
      20.850000
      34.423333
      5.026667
      34.423333
    
    
      306
      58.323408
      0.413944
      15.147459
      0.332946
    
    
      315
      89.741871
      37.877167
      30.037748
      29.966045
    
    
      325
      41.693680
      10.910120
      12.637040
      8.450040
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      5848
      64.369375
      6.918750
      6.246875
      6.640000
    
    
      5849
      57.696226
      103.660377
      4.659434
      92.076604
    
    
      5850
      77.928125
      406.156250
      15.178125
      21.138750
    
    
      5851
      7.815172
      8.853966
      0.614310
      8.064828
    
    
      5862
      67.007342
      13.052658
      4.508608
      13.052658
    
    
      5892
      42.649400
      65.805000
      6.659400
      65.805000
    
    
      5932
      2.205531
      6.393438
      0.284469
      3.015219
    
    
      5993
      25.293000
      39.726095
      2.705190
      38.623524
    
    
      6008
      45.440969
      98.860111
      9.686133
      98.282004
    
    
      6059
      1.691333
      7.141000
      0.154000
      7.141000
    
    
      6071
      27.861905
      15.863175
      2.347778
      15.863175
    
    
      6117
      43.682647
      2.532353
      3.482941
      2.532353
    
    
      6121
      48.427692
      40.369231
      3.295897
      40.369231
    
    
      6123
      24.650750
      171.610750
      6.232750
      149.945000
    
    
      6155
      50.840800
      44.300600
      6.515800
      44.300600
    
    
      6162
      19.844211
      14.155789
      1.085579
      14.155789
    
    
      6186
      36.641948
      32.107792
      7.678753
      32.107792
    
    
      6189
      14.070000
      13.564333
      1.071667
      13.564333
    
    
      6221
      2.456500
      60.885000
      0.263000
      60.885000
    
    
      6223
      21.434800
      9.553000
      1.835000
      9.553000
    
    
      6274
      31.145769
      39.202308
      5.429269
      38.919846
    
    
      6315
      293.539286
      137.862857
      20.013571
      137.862857
    
    
      6373
      44.377778
      10.790972
      3.864583
      10.463333
    
    
      6378
      21.296963
      95.476852
      1.466963
      95.279963
    
    
      6443
      18.864103
      48.201282
      1.785128
      48.201282
    
    
      6481
      17.153357
      10.052357
      2.103714
      6.549000
    
    
      6489
      37.735964
      42.591564
      4.069527
      42.019200
    
    
      6526
      13.083524
      19.279810
      2.185619
      19.279810
    
    
      6694
      17.754971
      30.111571
      3.015971
      30.111571
    
    
      6718
      104.144500
      135.966500
      4.339500
      127.668000
    
  

1455 rows × 4 columns

After deleting the outlier samples, the cluster analysis result is much more reasonable.



In [630]:

    
# Re-fit K-means with four clusters on the current feature matrix and label every row.
num_clust = 4
clusters = KMeans(n_clusters=num_clust).fit(X)
cluster_assignments = clusters.predict(X)

# One panel per cluster: every member row as a thin grey line, with the
# cluster centre drawn on top in black. The member count is printed per cluster.
fig2 = plt.figure(figsize=(20,15))
for cluster_id, centre in enumerate(clusters.cluster_centers_):
    plt.subplot(num_clust + 1, 2, cluster_id + 1)
    cluster_members = X[cluster_assignments == cluster_id]
    print(len(cluster_members))
    for member in cluster_members:
        plt.plot(member, color='grey', lw='0.1')
    plt.plot(centre, color='k', lw='1')



In [631]:

    
# Display the cluster label that the K-means fit predicted for each row of X.
cluster_assignments









    Out[631]:





array([0, 0, 0, ..., 0, 0, 1])



In [632]:

    
# Add each sample's cluster label as an 'assignment' column, then bring in its
# PBA code from the raw table by joining on the shared row index; the inner
# join keeps only rows present in both.
y = y.assign(assignment=cluster_assignments).join(data['PBA'], how='inner')

The correspondence between cluster assignment and building type has to be determined by visual inspection. This correspondence can be different every time the notebook is run.



In [633]:

    
# Cluster label expected for each building type (PBA code -> cluster id).
# These pairings were chosen by visual inspection of the cluster plots and must
# be re-checked after every re-fit, because K-means numbers clusters arbitrarily.
pba_to_cluster = {2: 1, 13: 3, 16: 0, 26: 2}

# One vectorised column assignment replaces the chained
# `y['judge'].iloc[...] = value` writes, which raise SettingWithCopyWarning and
# can end up writing to a temporary copy instead of `y`.
# PBA codes outside the mapping keep the original default of 1.
y['judge'] = y['PBA'].map(pba_to_cluster).fillna(1).astype(int)

# Per-column count of the rows whose cluster label equals the expected one.
y[y['judge'] == y['assignment']].count()









    



C:\Users\thong\Anaconda3\lib\site-packages\pandas\core\indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)






    Out[633]:





ELBTU         79
NGBTU         79
ELVNBTU       79
NGHTBTU       79
assignment    79
PBA           79
judge         79
dtype: int64



In [611]:

    
# Ratio of 324 to the sum of the four counts 139, 434, 23 and 58.
# NOTE(review): presumably matched samples over total samples, copied by hand
# from an earlier run -- confirm where these numbers come from.
a = 324 / sum((139, 434, 23, 58))



In [612]:

    
# Display the ratio stored in `a`.
a









    Out[612]:





0.4954128440366973



In [ ]:

	REGION	CENDIV	PBA	FREESTN	SQFT	SQFTC	WLCNS	RFCNS	RFCOOL	RFTILT	...	FKCLBTU	FKWTBTU	FKCKBTU	FKOTBTU	DHHTBTU	DHCLBTU	DHWTBTU	DHCKBTU	DHOTBTU	PUBCLIM
PUBID
6716	3	5	14	1.0	108000	7	1	6	2	1	...	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN	NaN	2
6717	3	7	5	1.0	1700	2	5	5	2	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2
6718	2	3	26	1.0	2000	2	1	4	2	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1
6719	1	2	12	1.0	19250	4	1	4	2	1	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2
6720	3	5	14	1.0	142000	7	1	1	1	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2

	ELBTU	NGBTU	ELVNBTU	NGHTBTU
PUBID
9	59.424400	19.931467	17.621520	16.893707
11	42.062146	20.280000	18.361463	19.146244
18	10.216667	8.183000	2.971167	8.183000
20	37.380282	22.818521	14.557141	21.278775
36	30.185778	8.474478	13.816567	7.493933
51	41.638982	29.714506	14.336681	20.583518
69	111.319939	3.025333	33.627515	0.912000
100	25.897111	17.766667	7.408889	17.686444
108	20.873004	29.933220	6.394295	27.467557
115	44.951358	17.417264	9.733698	15.784358
126	23.904455	51.566818	7.272182	50.182545
127	27.743737	3.023232	6.886364	2.647980
136	35.718333	46.894167	8.124167	45.752500
144	15.735960	18.139394	5.893737	18.139394
152	10.443100	7.113500	2.025250	5.181050
165	352.486637	52.439644	61.461287	42.351813
175	34.545566	33.377292	7.076623	33.377292
222	16.480294	2.421816	4.573518	1.926827
235	4.984667	5.070400	2.002533	5.070400
237	63.594504	6.742274	10.754800	6.742274
248	42.843948	64.437343	9.268919	64.437343
264	9.204675	6.999475	2.561925	1.634537
282	92.075067	24.982667	29.381333	22.995467
293	43.000548	3.341918	12.905205	0.055342
297	98.516132	85.183974	28.013116	50.586268
299	3.289518	0.210000	1.287470	0.210000
300	20.850000	34.423333	5.026667	34.423333
306	58.323408	0.413944	15.147459	0.332946
315	89.741871	37.877167	30.037748	29.966045
325	41.693680	10.910120	12.637040	8.450040
...	...	...	...	...
5848	64.369375	6.918750	6.246875	6.640000
5849	57.696226	103.660377	4.659434	92.076604
5850	77.928125	406.156250	15.178125	21.138750
5851	7.815172	8.853966	0.614310	8.064828
5862	67.007342	13.052658	4.508608	13.052658
5892	42.649400	65.805000	6.659400	65.805000
5932	2.205531	6.393438	0.284469	3.015219
5993	25.293000	39.726095	2.705190	38.623524
6008	45.440969	98.860111	9.686133	98.282004
6059	1.691333	7.141000	0.154000	7.141000
6071	27.861905	15.863175	2.347778	15.863175
6117	43.682647	2.532353	3.482941	2.532353
6121	48.427692	40.369231	3.295897	40.369231
6123	24.650750	171.610750	6.232750	149.945000
6155	50.840800	44.300600	6.515800	44.300600
6162	19.844211	14.155789	1.085579	14.155789
6186	36.641948	32.107792	7.678753	32.107792
6189	14.070000	13.564333	1.071667	13.564333
6221	2.456500	60.885000	0.263000	60.885000
6223	21.434800	9.553000	1.835000	9.553000
6274	31.145769	39.202308	5.429269	38.919846
6315	293.539286	137.862857	20.013571	137.862857
6373	44.377778	10.790972	3.864583	10.463333
6378	21.296963	95.476852	1.466963	95.279963
6443	18.864103	48.201282	1.785128	48.201282
6481	17.153357	10.052357	2.103714	6.549000
6489	37.735964	42.591564	4.069527	42.019200
6526	13.083524	19.279810	2.185619	19.279810
6694	17.754971	30.111571	3.015971	30.111571
6718	104.144500	135.966500	4.339500	127.668000